Source code for nlp_architect.data.cdc_resources.relations.wikipedia_relation_extraction

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************

from __future__ import division

import logging
import os
from typing import Set, List

from nlp_architect.common.cdc.mention_data import MentionDataLight
from nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_pages import WikipediaPages
from nlp_architect.data.cdc_resources.relations.relation_extraction import RelationExtraction
from nlp_architect.data.cdc_resources.relations.relation_types_enums import RelationType, \
    WikipediaSearchMethod
from nlp_architect.data.cdc_resources.wikipedia.wiki_elastic import WikiElastic
from nlp_architect.data.cdc_resources.wikipedia.wiki_offline import WikiOffline
from nlp_architect.data.cdc_resources.wikipedia.wiki_online import WikiOnline
from nlp_architect.utils.string_utils import StringUtils

logger = logging.getLogger(__name__)


class WikipediaRelationExtraction(RelationExtraction):
    """Extract relations between two mentions according to Wikipedia knowledge."""

    def __init__(self, method: WikipediaSearchMethod = WikipediaSearchMethod.ONLINE,
                 wiki_file: str = None, host: str = None, port: int = None,
                 index: str = None, filter_pronouns: bool = True,
                 filter_time_data: bool = True) -> None:
        """
        Extract Relation between two mentions according to Wikipedia knowledge

        Args:
            method (optional): WikipediaSearchMethod.{ONLINE/OFFLINE/ELASTIC} run against wiki
                site a sub-set of wiki or on a local elastic database (default = ONLINE)
            wiki_file (required on OFFLINE mode): str Location of Wikipedia file to work with
                (must be an existing directory, it is checked with os.path.isdir)
            host (required on Elastic mode): str the Elastic search host name
            port (required on Elastic mode): int the Elastic search port number
            index (required on Elastic mode): str the Elastic search index name
            filter_pronouns (optional): bool when True, a pair in which both mentions are
                pronouns is reported as NO_RELATION_FOUND before any page lookup
            filter_time_data (optional): bool when True, a pair in which both mentions are
                tagged DATE/TIME is reported as NO_RELATION_FOUND before any page lookup

        Raises:
            FileNotFoundError: OFFLINE mode and wiki_file is None or not a directory
            ValueError: method is not one of ONLINE / OFFLINE / ELASTIC
        """
        logger.info('Loading Wikipedia module')
        self.filter_pronouns = filter_pronouns
        self.filter_time_data = filter_time_data

        if method == WikipediaSearchMethod.ONLINE:
            self.pywiki_impl = WikiOnline()
        elif method == WikipediaSearchMethod.OFFLINE:
            if wiki_file is None or not os.path.isdir(wiki_file):
                raise FileNotFoundError('Wikipedia resource file not found or not in path, '
                                        'create it or change the initialization method')
            self.pywiki_impl = WikiOffline(wiki_file)
        elif method == WikipediaSearchMethod.ELASTIC:
            self.pywiki_impl = WikiElastic(host, port, index)
        else:
            # Fail fast: without this branch self.pywiki_impl would stay undefined and the
            # problem would only show up later as an AttributeError.
            raise ValueError('Unsupported Wikipedia search method: {}'.format(method))

        logger.info('Wikipedia module lead successfully')
        super().__init__()

    def _lookup_candidate_pages(self, mention_x: MentionDataLight,
                                mention_y: MentionDataLight):
        """
        Apply the pronoun / date-time filters and fetch the pages related to both mentions

        Shared by extract_all_relations and extract_sub_relations.

        Returns:
            None when the pair is filtered out or when either lookup reports an empty
            normalized phrase, otherwise the tuple
            (mention1_str, mention2_str, pages1, pages2)
        """
        mention1_str = mention_x.tokens_str.strip()
        mention2_str = mention_y.tokens_str.strip()

        if self.filter_pronouns and \
                self.is_both_opposite_personal_pronouns(mention1_str, mention2_str):
            return None

        if self.filter_time_data and self.is_both_data_or_time(mention_x, mention_y):
            return None

        pages1 = self.get_phrase_related_pages(mention1_str)
        pages2 = self.get_phrase_related_pages(mention2_str)

        # check if search phrase is empty meaning it is probably a stop word
        if pages1.is_empty_norm_phrase or pages2.is_empty_norm_phrase:
            return None

        return mention1_str, mention2_str, pages1, pages2

    @staticmethod
    def _collect_titles(pages1: WikipediaPages, pages2: WikipediaPages,
                        mention1_str: str, mention2_str: str):
        """
        Get the title sets of both page groups, each extended with the two mention
        strings concatenated in both orders

        Returns:
            tuple (titles1, titles2)
        """
        forward = mention1_str + ' ' + mention2_str
        backward = mention2_str + ' ' + mention1_str

        titles1 = pages1.get_and_set_titles()
        titles1.add(forward)
        titles1.add(backward)

        titles2 = pages2.get_and_set_titles()
        titles2.add(forward)
        titles2.add(backward)

        return titles1, titles2

    def extract_all_relations(self, mention_x: MentionDataLight,
                              mention_y: MentionDataLight) -> Set[RelationType]:
        """
        Try to find if mentions has anyone or more of the relations this class support

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight

        Returns:
            Set[RelationType]: One or more of: RelationType.WIKIPEDIA_BE_COMP,
            RelationType.WIKIPEDIA_TITLE_PARENTHESIS, RelationType.WIKIPEDIA_DISAMBIGUATION,
            RelationType.WIKIPEDIA_CATEGORY, RelationType.WIKIPEDIA_REDIRECT_LINK,
            RelationType.WIKIPEDIA_ALIASES, or a set holding only
            RelationType.NO_RELATION_FOUND when nothing matched.
            RelationType.WIKIPEDIA_PART_OF_SAME_NAME is never produced by this method.
        """
        candidates = self._lookup_candidate_pages(mention_x, mention_y)
        if candidates is None:
            return {RelationType.NO_RELATION_FOUND}

        mention1_str, mention2_str, pages1, pages2 = candidates
        relations = set()

        if self.is_redirect_same(pages1, pages2):
            relations.add(RelationType.WIKIPEDIA_REDIRECT_LINK)

        titles1, titles2 = self._collect_titles(pages1, pages2, mention1_str, mention2_str)

        # Same evaluation order as the individual checks: aliases, disambiguation,
        # category, parenthesis, be-comp.
        extractors = (self.extract_aliases, self.extract_disambig, self.extract_category,
                      self.extract_parenthesis, self.extract_be_comp)
        for extractor in extractors:
            relation = extractor(pages1, pages2, titles1, titles2)
            if relation is not RelationType.NO_RELATION_FOUND:
                relations.add(relation)

        if not relations:
            relations.add(RelationType.NO_RELATION_FOUND)

        return relations

    def extract_sub_relations(self, mention_x: MentionDataLight, mention_y: MentionDataLight,
                              relation: RelationType) -> RelationType:
        """
        Check if input mentions has the given relation between them

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight
            relation: RelationType

        Returns:
            RelationType: relation in case mentions has given relation or
            RelationType.NO_RELATION_FOUND otherwise (including when relation is not one
            of the relations handled below, e.g. WIKIPEDIA_PART_OF_SAME_NAME)
        """
        candidates = self._lookup_candidate_pages(mention_x, mention_y)
        if candidates is None:
            return RelationType.NO_RELATION_FOUND

        mention1_str, mention2_str, pages1, pages2 = candidates

        # The redirect check needs no titles, so it is answered before they are collected.
        if relation == RelationType.WIKIPEDIA_REDIRECT_LINK:
            if self.is_redirect_same(pages1, pages2):
                return RelationType.WIKIPEDIA_REDIRECT_LINK
            return RelationType.NO_RELATION_FOUND

        titles1, titles2 = self._collect_titles(pages1, pages2, mention1_str, mention2_str)

        title_based = (
            (RelationType.WIKIPEDIA_ALIASES, self.extract_aliases),
            (RelationType.WIKIPEDIA_DISAMBIGUATION, self.extract_disambig),
            (RelationType.WIKIPEDIA_CATEGORY, self.extract_category),
            (RelationType.WIKIPEDIA_TITLE_PARENTHESIS, self.extract_parenthesis),
            (RelationType.WIKIPEDIA_BE_COMP, self.extract_be_comp),
        )
        for supported, extractor in title_based:
            if relation == supported:
                return extractor(pages1, pages2, titles1, titles2)

        return RelationType.NO_RELATION_FOUND

    @staticmethod
    def extract_be_comp(pages1: WikipediaPages, pages2: WikipediaPages,
                        titles1: Set[str], titles2: Set[str]) -> RelationType:
        """
        Check if input mentions has be-comp/is-a relation

        Args:
            pages1: WikipediaPages
            pages2: WikipediaPages
            titles1: Set[str]
            titles2: Set[str]

        Returns:
            RelationType.WIKIPEDIA_BE_COMP or RelationType.NO_RELATION_FOUND
        """
        # `or` short-circuits, so pages2 is only queried when pages1 gave no overlap.
        if pages1.get_and_set_be_comp() & titles2 or pages2.get_and_set_be_comp() & titles1:
            return RelationType.WIKIPEDIA_BE_COMP
        return RelationType.NO_RELATION_FOUND

    @staticmethod
    def extract_parenthesis(pages1: WikipediaPages, pages2: WikipediaPages,
                            titles1: Set[str], titles2: Set[str]) -> RelationType:
        """
        Check if input mentions has parenthesis relation

        Args:
            pages1: WikipediaPages
            pages2: WikipediaPages
            titles1: Set[str]
            titles2: Set[str]

        Returns:
            RelationType.WIKIPEDIA_TITLE_PARENTHESIS or RelationType.NO_RELATION_FOUND
        """
        # `or` short-circuits, so pages2 is only queried when pages1 gave no overlap.
        if pages1.get_and_set_parenthesis() & titles2 \
                or pages2.get_and_set_parenthesis() & titles1:
            return RelationType.WIKIPEDIA_TITLE_PARENTHESIS
        return RelationType.NO_RELATION_FOUND

    @staticmethod
    def extract_category(pages1: WikipediaPages, pages2: WikipediaPages,
                         titles1: Set[str], titles2: Set[str]) -> RelationType:
        """
        Check if input mentions has category relation

        Args:
            pages1: WikipediaPages
            pages2: WikipediaPages
            titles1: Set[str]
            titles2: Set[str]

        Returns:
            RelationType.WIKIPEDIA_CATEGORY or RelationType.NO_RELATION_FOUND
        """
        # `or` short-circuits, so pages2 is only queried when pages1 gave no overlap.
        if pages1.get_and_set_all_categories() & titles2 \
                or pages2.get_and_set_all_categories() & titles1:
            return RelationType.WIKIPEDIA_CATEGORY
        return RelationType.NO_RELATION_FOUND

    @staticmethod
    def extract_disambig(pages1: WikipediaPages, pages2: WikipediaPages,
                         titles1: Set[str], titles2: Set[str]) -> RelationType:
        """
        Check if input mentions has disambiguation relation

        Args:
            pages1: WikipediaPages
            pages2: WikipediaPages
            titles1: Set[str]
            titles2: Set[str]

        Returns:
            RelationType.WIKIPEDIA_DISAMBIGUATION or RelationType.NO_RELATION_FOUND
        """
        # `or` short-circuits, so pages2 is only queried when pages1 gave no overlap.
        if pages1.get_and_set_all_disambiguation() & titles2 \
                or pages2.get_and_set_all_disambiguation() & titles1:
            return RelationType.WIKIPEDIA_DISAMBIGUATION
        return RelationType.NO_RELATION_FOUND

    @staticmethod
    def extract_aliases(pages1: WikipediaPages, pages2: WikipediaPages,
                        titles1: Set[str], titles2: Set[str]) -> RelationType:
        """
        Check if input mentions has aliases relation

        Args:
            pages1: WikipediaPages
            pages2: WikipediaPages
            titles1: Set[str]
            titles2: Set[str]

        Returns:
            RelationType.WIKIPEDIA_ALIASES or RelationType.NO_RELATION_FOUND
        """
        # `or` short-circuits, so pages2 is only queried when pages1 gave no overlap.
        if pages1.get_and_set_all_aliases() & titles2 \
                or pages2.get_and_set_all_aliases() & titles1:
            return RelationType.WIKIPEDIA_ALIASES
        return RelationType.NO_RELATION_FOUND

    def is_part_of_same_name(self, pages1: WikipediaPages, pages2: WikipediaPages) -> bool:
        """
        Check if input mentions has part of same name relation (eg: page1=John, page2=Smith)

        For every pair of pages that are both flagged as part of a name, the two original
        phrases are joined with a space and looked up; the relation holds as soon as one
        returned page has a non-zero page id.

        NOTE(review): no method of this class visible here calls this check - confirm
        whether it is still used elsewhere.

        Args:
            pages1: WikipediaPages
            pages2: WikipediaPages

        Returns:
            bool
        """
        for page1 in pages1.pages:
            for page2 in pages2.pages:
                if page1.relations.is_part_name and page2.relations.is_part_name:
                    full_name = page1.orig_phrase + ' ' + page2.orig_phrase
                    for page in self.pywiki_impl.get_pages(full_name):
                        if page.page_result.pageid != 0:
                            return True

        return False

    @staticmethod
    def is_redirect_same(pages1: WikipediaPages, pages2: WikipediaPages) -> bool:
        """
        Check if input mentions has same wikipedia redirect page

        Args:
            pages1: WikipediaPages
            pages2: WikipediaPages

        Returns:
            bool: True when some page of pages1 and some page of pages2 share the same
            positive page id
        """
        for page1 in pages1.get_pages():
            for page2 in pages2.get_pages():
                # Only positive ids are compared; ids <= 0 never count as a match.
                if page1.pageid > 0 and page2.pageid > 0 and page1.pageid == page2.pageid:
                    return True

        return False

    @staticmethod
    def get_supported_relations() -> List[RelationType]:
        """
        Return all supported relations by this class

        NOTE(review): WIKIPEDIA_PART_OF_SAME_NAME is listed here, but neither
        extract_all_relations nor extract_sub_relations ever returns it.

        Returns:
            List[RelationType]
        """
        return [RelationType.WIKIPEDIA_BE_COMP,
                RelationType.WIKIPEDIA_TITLE_PARENTHESIS,
                RelationType.WIKIPEDIA_DISAMBIGUATION,
                RelationType.WIKIPEDIA_CATEGORY,
                RelationType.WIKIPEDIA_REDIRECT_LINK,
                RelationType.WIKIPEDIA_ALIASES,
                RelationType.WIKIPEDIA_PART_OF_SAME_NAME]

    @staticmethod
    def is_both_opposite_personal_pronouns(phrase1: str, phrase2: str) -> bool:
        """
        check if both phrases refers to pronouns

        Both phrases are lower-cased before being tested with StringUtils.is_pronoun.

        Returns:
            bool
        """
        return bool(StringUtils.is_pronoun(phrase1.lower())
                    and StringUtils.is_pronoun(phrase2.lower()))

    @staticmethod
    def is_both_data_or_time(mention1: MentionDataLight, mention2: MentionDataLight) -> bool:
        """
        check if both phrases refers to time or date

        The NER tag stored on each mention is used when present; otherwise it is taken
        from StringUtils.find_head_lemma_pos_ner on the mention text.

        Returns:
            bool: True only when both NER tags contain 'DATE' or 'TIME'
        """
        mention1_ner = mention1.mention_ner
        mention2_ner = mention2.mention_ner

        if mention1_ner is None:
            _, _, _, mention1_ner = StringUtils.find_head_lemma_pos_ner(mention1.tokens_str)
        if mention2_ner is None:
            _, _, _, mention2_ner = StringUtils.find_head_lemma_pos_ner(mention2.tokens_str)

        # A mention that still has no NER tag cannot be a date/time; guarding here avoids
        # a TypeError from `'DATE' in None`.
        if mention1_ner is None or mention2_ner is None:
            return False

        is1_time_or_date = 'DATE' in mention1_ner or 'TIME' in mention1_ner
        is2_time_or_date = 'DATE' in mention2_ner or 'TIME' in mention2_ner
        return is1_time_or_date and is2_time_or_date